import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import sklearn as sk
import sklearn.naive_bayes as sknb
import sklearn.metrics as skm
import sklearn.cluster as skc
import sklearn.decomposition as skd
import sklearn.preprocessing as skp
import statsmodels.formula.api as smf
import statsmodels.api as sm
import sklearn.tree as sktree
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
sns.set_context("notebook") # make figures fit
from pylab import rcParams
import matplotlib.pyplot as plt
from statsmodels.graphics.mosaicplot import mosaic
from IPython.display import Image
from IPython.core.display import HTML
import sklearn.externals.six as sksix
import IPython.display as ipd
import os
import pydot_ng as pydot
# load the bokeh stuff
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
output_notebook()
from bokeh.layouts import column, row
from bokeh.charts import Scatter
from bokeh.charts import TimeSeries
from bokeh.charts import Bar
from bokeh.charts import Histogram
from bokeh.charts import BoxPlot
from bokeh.layouts import gridplot
from bokeh.models import HoverTool
from bokeh.models import ColumnDataSource
# Load helper functions (e.g. fourPlot) from the local useful.py script.
# NOTE(review): exec of a file is a notebook convenience; it assumes
# useful.py is trusted and present in the working directory.
exec(open('useful.py').read())
# make the Pandas tables a little more readable
from IPython.core.display import HTML
# Inject local CSS so DataFrame tables render nicely in the notebook.
css = open('style-table.css').read() + open('style-notebook.css').read()
HTML('<style>{}</style>'.format(css))
# Load the Lahman baseball tables used throughout the analysis.
Master = pd.read_csv("Master.csv")
Salaries = pd.read_csv("Salaries.csv")
Teams = pd.read_csv("Teams.csv")
# yearID is parsed as a date in the two per-season tables so that
# time-series grouping and plotting work later on.
Batting = pd.read_csv("Batting.csv", parse_dates = ['yearID'])
Appearances = pd.read_csv("Appearances.csv", parse_dates=['yearID'])
# Keep only the biographical columns of interest; weight is the focus,
# with birthCountry kept as a possible factor.
bio_cols = ['playerID', 'weight', 'height', 'birthCountry']
MasterSel = Master[bio_cols]
MasterSel.head(2)
# Only a handful of rows carry missing values, so dropping them is safe.
MasterSel = MasterSel.dropna()
# As with MasterSel, trim the Batting table down to the hitting
# statistics that the later analysis actually uses.
batting_cols = ['playerID', 'yearID', 'teamID', 'G', 'AB', 'R', 'H',
                'RBI', 'HR', '2B', '3B', 'SO']
BattingSel = Batting[batting_cols]
# Players with no batting data cannot contribute to the batting analysis,
# so drop them; subsequent merges are inner joins on the players kept here.
BattingSel = BattingSel.dropna()
BattingSel.head(2)
# Positions is the appearances table cut down
# the variables in this dataframe are the number of times
# a player has played such position in that given year
# this will later give us the position of each player
Positions = Appearances[['yearID','playerID', 'G_all','G_p', 'G_c', 'G_1b', 'G_2b', 'G_3b', 'G_ss', 'G_of']]
Positions.head(2)
# Index every table on playerID so they can all be merged on the index.
MasterSel = MasterSel.set_index('playerID')
BattingSel = BattingSel.set_index('playerID')
Positions= Positions.set_index('playerID')
# MasterPos combines physical attributes (weight/height/birthCountry)
# with per-position appearance counts, one row per player-season.
MasterPos = pd.merge(MasterSel, Positions, left_index=True, right_index=True, how='inner')
#MasterPos.describe()
MasterPos.head()
# Label each player-season with a primary position: the fielding slot
# where the player appeared in the most games that year.
position_cols = ['G_p', 'G_c', 'G_1b', 'G_2b', 'G_3b', 'G_ss', 'G_of']
position_names = {'G_p': 'Pitcher', 'G_c': 'Catcher', 'G_1b': 'FirstBase',
                  'G_2b': 'SecondBase', 'G_3b': 'ThirdBase',
                  'G_ss': 'ShortStop', 'G_of': 'Outfielder'}
# Games played at the most-frequent position (kept for reference).
MasterPos['pmax'] = MasterPos[position_cols].max(axis=1)
# FIX: the original iterated row by row and wrote labels through the
# deprecated (and since-removed) positional .ix indexer.  idxmax(axis=1)
# returns the FIRST column holding the row maximum, which reproduces the
# original if/elif tie-breaking order (pitcher first, outfielder last),
# and is vectorized.  fillna(0) guards against missing game counts.
MasterPos['position'] = (MasterPos[position_cols]
                         .fillna(0)
                         .idxmax(axis=1)
                         .map(position_names))
# Drop the per-position game counts now that the label is computed.
MasterPos = MasterPos[['yearID','weight','height','birthCountry','G_all','position']]
# MasterPos.position.value_counts() examined positions
MasterPos.head(2) # MasterPos now carries the positional labels
# Lookup of (playerID index, yearID) -> position for later merges.
Position_df = MasterPos[['yearID','position']]
# One row of physical attributes per player.
Weight_df = MasterPos[['weight','height','birthCountry']]
# FIX: drop_duplicates() compares column values only and ignores the
# playerID index, so two DIFFERENT players with identical
# (weight, height, birthCountry) would collapse to a single row and the
# second player would silently vanish from later merges.  Dedupe on the
# index instead -- the attributes are constant per player.
Weight_df = Weight_df[~Weight_df.index.duplicated(keep='first')]
# Weight distribution across all player-seasons.
weighthist = Histogram(MasterPos, 'weight', bins = 30, xlabel='Weight', ylabel='# of Players', plot_width =600, title='Weight Distribution',color = 'navy')
show(weighthist)
# QQ plot (fourPlot helper, presumably loaded from useful.py) to
# eyeball how close the weight distribution is to normal.
fourPlot(MasterPos['weight'])
Once I plotted the height histogram and examined its QQ plot, I confirmed that height is also approximately normally distributed.
# Height distribution mirrors weight: roughly bell-shaped.
height_fig = Histogram(MasterPos, 'height', bins = 20, xlabel='height',
                       ylabel='# of Players', plot_width =600,
                       title='Height Distribution',color = 'red')
show(height_fig)
# The QQ plot confirms height is close to normal as well.
fourPlot(MasterPos['height'])
## BMI distribution
# CDC formula for imperial units: BMI = 703 * weight(lb) / height(in)^2.
MasterPos['BMI'] = (MasterPos['weight']/np.power(MasterPos['height'],2)) *703
BMIHist = Histogram(MasterPos,'BMI',bins=30, ylabel='# of Players', title = 'BMI Distribution',color='purple')
show(BMIHist)
# Standard cut-offs: below 18.5 underweight, 25+ overweight, 30+ obese.
# FIX: the overweight threshold was typed as 26; per the standard
# cut-offs (and the original comment) it should be 25.
perofBMI = [100 - stats.percentileofscore(MasterPos['BMI'],i) for i in [18.5,25,30,40]]
perofBMI
100 - perofBMI[0] # share of players below the underweight cut-off
# the remaining entries give the share above each higher cut-off
Let's look at the average weight of baseball players per year. Because the data spans 1871–2016, yearly averages will be easier to plot and easier to interpret.
# Average player weight per season.
AvgLBTS = MasterPos.groupby('yearID')[['weight']].mean()
AvgLBTS.head(2)
# Quick matplotlib look at the raw yearly series.
weight_series = AvgLBTS["weight"]
plt.plot(weight_series)
# 10-year rolling mean: the wider the window, the smoother the curve.
roll = AvgLBTS.rolling(window=10)
AvgLBTS['weight'].plot(color='gray')
roll.mean()['weight'].plot(color='red')
# Bokeh wants yearID as a column rather than the index.
AvgLBTSN_IX = AvgLBTS.reset_index()
AvgLBTSN_IX.head(2)
TS = TimeSeries(AvgLBTSN_IX,
                x= "yearID", y='weight', title="Timeseries", ylabel='Average Weight',xlabel='Year',plot_width=400,color='LightBlue', legend=False)
show(TS)
# Interactive bokeh scatter of average weight by year, with hover labels.
weight_by_year = AvgLBTSN_IX.copy()
# Hover tooltips read cleaner with the year as a plain 'YYYY' string.
weight_by_year['yearID'] = weight_by_year['yearID'].dt.strftime('%Y')
# Data source feeding the glyphs.
weight_source = ColumnDataSource(weight_by_year)
# Fields shown in the hover tooltip.
weight_tooltips = [('weight', '@weight'),
                   ('year', '@yearID')]
weight_hover = HoverTool(tooltips=weight_tooltips)
weight_fig = figure(plot_width=700,plot_height=450,title="MLB weight over the years", x_axis_label='Year',y_axis_label = "Average Weight (in Pounds)")
weight_fig.add_tools(weight_hover)
# One circle per season.
weight_fig.circle(x='yearID', y='weight', size=7, alpha=0.5, source=weight_source)
show(weight_fig)
# Average yearly weight broken out by primary position.
AvgLBPos = MasterPos.groupby(['yearID','position'])[['weight']].mean()
# NOTE(review): the unstack().stack() round-trip looks redundant; it
# appears intended to align/normalize the (year, position) index before
# transposing so each position becomes a plottable series -- confirm.
AvgLBPos = AvgLBPos.unstack().stack().T.stack()
# One line per position, years along the x-axis.
PLOT = AvgLBPos.T.plot(title='Average Yearly Weight by Position')
PLOT.set_xlabel('Year')
PLOT.set_ylabel('Average Weight')
PLOT.legend(loc = 'upper left',fontsize = 7)
# Gaussian Naive Bayes: given weight alone, predict primary position.
gnb_model = sknb.GaussianNB()
# FIX: Series.reshape was deprecated and then removed from pandas; a
# single-column DataFrame already provides the 2-D (n_samples, 1)
# design matrix scikit-learn expects.
gnb_model.fit(MasterPos[['weight']], MasterPos["position"])
y_pred = gnb_model.predict(MasterPos[['weight']])
MasterPos['predicted_nb'] = y_pred
# In-sample accuracy of the weight-only NB classifier.
skm.accuracy_score(y_true=MasterPos["position"],y_pred=MasterPos["predicted_nb"])
### Decision tree on weight alone
dt_model = sktree.DecisionTreeClassifier(max_depth = 2, criterion='entropy')
dt_model.fit(MasterPos[['weight']],MasterPos['position'])
predicted_labels = dt_model.predict(MasterPos[['weight']])
MasterPos['predicted_label_tree'] = predicted_labels
### Weight alone
# FIX: sklearn.externals.six was removed from scikit-learn; the standard
# library's io.StringIO is the drop-in replacement for collecting the
# graphviz dot output.
from io import StringIO
dot_data = StringIO()
sktree.export_graphviz(dt_model, out_file=dot_data,
feature_names=['weight'],
filled=True)
# Render the exported tree inline.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
ipd.Image(graph.create_png())
# In-sample accuracy and confusion matrix for the weight-only tree.
skm.accuracy_score(y_true=MasterPos["position"],y_pred=MasterPos['predicted_label_tree'])
cfmat = skm.confusion_matrix(
y_true = MasterPos["position"],
y_pred = MasterPos['predicted_label_tree']
)
sns.heatmap(cfmat, annot=True)
### Perhaps Height and Weight ?
dt_model = sktree.DecisionTreeClassifier(max_depth = 2, criterion='entropy')
dt_model.fit(MasterPos[['weight','height']],MasterPos['position'])
predicted_labels = dt_model.predict(MasterPos[['weight','height']])
MasterPos['predicted_label_tree'] = predicted_labels
# FIX: sklearn.externals.six was removed from scikit-learn; use the
# standard library's io.StringIO instead.
from io import StringIO
dot_data = StringIO()
sktree.export_graphviz(dt_model, out_file=dot_data,
feature_names=['weight','height'],
filled=True)
# Render the exported tree inline.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
ipd.Image(graph.create_png())
# In-sample accuracy and confusion matrix for the two-feature tree.
skm.accuracy_score(y_true=MasterPos["position"],y_pred=MasterPos['predicted_label_tree'])
cfmat = skm.confusion_matrix(
y_true = MasterPos["position"],
y_pred = MasterPos['predicted_label_tree']
)
sns.heatmap(cfmat, annot=True)
# How the predicted labels distribute across the data.
MasterPos.groupby('predicted_label_tree').sum()
Looking at the predicted labels and the confusion matrix, we can see that the classifier is essentially guessing.
# Overlay the weight distribution of each position in one histogram.
PositionDist = MasterPos.copy()
# Sort so colour/legend assignment is stable across runs.
PositionDist.sort_values(by='position', inplace=True)
pos_hist = Histogram(PositionDist, values='weight', color='position',
                     title="Weight Distribution by Position", legend='top_right')
show(pos_hist)
# Join batting statistics with each player's physical attributes.
BatWeight = pd.merge(BattingSel, Weight_df, left_index=True, right_index=True, how='inner')
BatWeight.head(2)
# Restrict to players with a real opportunity to hit: more than 100
# at-bats -- even if they produced nothing, they at least had the
# chance to perform.
BatWeightH = BatWeight[BatWeight['AB'] > 100]
BatWeightH.head(2)
# Spearman correlation, since the counting statistics are skewed.
batstat = ['AB','R','H','HR','RBI','3B','2B','SO','weight','height']
cormat = BatWeightH[batstat].corr(method='spearman')
cormat
# Home runs show the strongest association with weight (~.47).
sns.heatmap(cormat,linewidths=.5,annot=True)
sns.set(font_scale=.5)
Perhaps when you weigh more you have more power, meaning you hit the ball farther, but you don't necessarily swing faster (a three-base hit is hard to pull off) — or maybe heavier players don't run fast enough to stretch a hit into a triple.
# Attach each player-season's primary position to the batting+weight
# rows; both keys (playerID, yearID) are needed since position varies
# by season.
BatWeightH_2 = BatWeightH.copy()
BatWeightH_2 = BatWeightH_2.reset_index()   # promote playerID to a column for the merge
Position_df = Position_df.reset_index()     # likewise
Pos_Bats = pd.merge(BatWeightH_2,Position_df,on=['playerID','yearID'])
Pos_Bats = Pos_Bats.set_index('playerID')
Pos_Bats.head(3)
# Slice out the two positions compared below.
FB = Pos_Bats[Pos_Bats['position'] == "FirstBase"]
SS = Pos_Bats[Pos_Bats['position'] == "ShortStop"]
# Five headline batting statistics plus weight.
Big5_W = ['R','HR','RBI','H','SO','weight']
# First basemen: scatterplot matrix of the selected stats.
sns.pairplot(FB[Big5_W])
# Heatmap to quantify the relationships seen in the SPLOM; Spearman is
# used because the variables are skewed.
fb_cormat = FB[Big5_W].corr(method='spearman')
sns.heatmap(fb_cormat,linewidths=.5,annot=True)
sns.set(font_scale=.4)
# Shortstops: the same pair of views.
sns.pairplot(SS[Big5_W])
# Again the variables look skewed, so Spearman is appropriate here too.
ss_cormat = SS[Big5_W].corr(method='spearman')
sns.heatmap(ss_cormat,linewidths=.5,annot=True)
sns.set(font_scale=.4)
# Career-average home runs per player (grouping on the playerID index).
AVGHR = BatWeightH.groupby(BatWeightH.index)['HR'].mean()
# Raw per-season HR counts vs per-player averages.
BatWeightH.HR.hist()
AVGHR.hist()
# add 1 so we dont take the log 0
qntls, xr = stats.probplot(np.log(BatWeightH.HR+1), fit=False)
sns.regplot(xr,qntls)
# NOTE(review): unlike the line above, this QQ plot is taken on AVGHR+1
# WITHOUT the log transform -- confirm whether np.log was intended here.
qntls, xr = stats.probplot(AVGHR+1, fit=False)
sns.regplot(xr,qntls)
# Average home runs per season, with yearID promoted to a column.
AvgHRByYr = BatWeight.groupby('yearID')[['HR']].mean()
AvgHRByYr = AvgHRByYr.reset_index()
AvgHRByYr.head(2)
from bokeh.layouts import row
# Side-by-side time series: average weight next to average home runs.
TS = TimeSeries(AvgLBTSN_IX,
                x= "yearID", y='weight', title="Timeseries", ylabel='Average Weight',xlabel='Year',plot_width=400,color='LightBlue', legend=False)
TS2 = TimeSeries(AvgHRByYr,
                 x= "yearID", y='HR', title="Timeseries", ylabel='Average HomeRuns',xlabel='Year',plot_width=400,color='LightCoral', legend=False)
show(row(TS,TS2))
# Interactive hover version of the home-run series; plain 'YYYY'
# strings make the tooltip readable.
AvgHRByYr['yearID'] = AvgHRByYr['yearID'].dt.strftime('%Y')
hr_source = ColumnDataSource(AvgHRByYr)
hr_tooltips = [('homeruns', '@HR'),
               ('year', '@yearID')]
hr_hover = HoverTool(tooltips=hr_tooltips)
hr_fig = figure(plot_width=700,plot_height=450,title="MLB homeruns over the years", x_axis_label='Year',y_axis_label = "Average Homeruns")
hr_fig.add_tools(hr_hover)
hr_fig.circle(x='yearID', y='HR', size=7, alpha=0.5, color='LightCoral', source=hr_source)
show(hr_fig)
# Basic shape and summary of the salary table.
Salaries.describe()
Salaries.dtypes
Salaries.head(3)
# Salaries are heavily right-skewed; log(1 + x) makes the distribution
# far more symmetric for inspection.
np.log(Salaries[['salary']] + 1).hist()
logSal = np.log(Salaries[['salary']] + 1)
# QQ/diagnostic plots for the log-salary distribution.
fourPlot(logSal)
# Per-year salary spread, one box per season.
s = BoxPlot(Salaries, values='salary', label='yearID', color='yearID',
title="Salary Box Plot by Year", width = 900, height=900)
show(s)
# Salaries.groupby('yearID').mean()
# # salaries started at 476,299 - 4,396,409
# what about their median
# Salaries.groupby('yearID').median()
# the median is 400,000 - 1,500,000
# Rewrite the Cubs' team code 'CHC' as 'CHN' in Salaries -- presumably
# to match the code used by the Teams table so later (year, team)
# lookups line up; confirm against Teams.teamID values.
Salaries['teamID'] = Salaries['teamID'].replace('CHC','CHN')
Teams.head(3)
Teams.columns
Next: a correlation matrix of team-level statistics, and a fix for the team-code mismatches noted above.
# Correlations among the core team statistics.
cormat = Teams[['Rank','W','L','R','RA','G','H','HR']].corr(method='spearman')
cormat
# Per-game run differential.
Teams['rundiffer'] = (Teams['R'] - Teams['RA']) / Teams.G
# FIX: winning percentage is W / (W + L); the original W / L is the
# win-loss *ratio* -- it exceeds 1 for winning teams and divides by
# zero for an undefeated team.
Teams['WinPCT'] = Teams['W'] / (Teams['W'] + Teams['L'])
# Modern era only (2000-2016).
Teams20 = Teams[Teams['yearID'] > 1999]
# OLS: does run differential predict winning percentage?
model1 = smf.ols('WinPCT ~ rundiffer', data=Teams20).fit()
model1.summary()
sns.regplot(x='rundiffer', y='WinPCT', data=Teams20)
influence = model1.get_influence()
# DFFITS measures each observation's influence on its own fitted value.
number_of_observations = len(Teams20)
number_of_parameters = 2 # parameters include: intercept, x
dffits = influence.dffits[0]
# Conventional empirical threshold: 2 * sqrt(k / n).
dffits_threshold = 2 * np.sqrt(number_of_parameters / number_of_observations)
# Rows flagged as influential outliers.
Teams20[np.abs(dffits) > dffits_threshold]
ax = sns.regplot(np.arange(len(dffits)),dffits,ci=0,fit_reg=False)
# NOTE(review): the threshold lines only span x in [0, 30]; use
# len(dffits) if the guides should cover every observation.
plt.hlines(dffits_threshold,0,30)
plt.hlines(-dffits_threshold,0,30)
# Refit with the influential observations excluded.
cleaned_teams20 = Teams20[np.abs(dffits) < dffits_threshold]
model2 = smf.ols('WinPCT ~ rundiffer', data=cleaned_teams20).fit()
model2.summary()
sns.regplot(x='rundiffer', y='WinPCT', data=cleaned_teams20)
# FIX: seaborn removed the sns.plt alias for pyplot; call matplotlib's
# pyplot (imported as plt) directly.
plt.title('Regression: WinPCT predicted by Run Differential for 2000-2016 (outliers excluded)')
# Repeat the regression over the full history (1871-2016).
model3 = smf.ols('WinPCT ~ rundiffer', data=Teams).fit()
model3.summary()
sns.regplot(x='rundiffer', y='WinPCT', data=Teams)
influence = model3.get_influence()
# Find outliers using DFFITS, as above.
number_of_observations = len(Teams)
number_of_parameters = 2 # parameters include: intercept, x
dffits = influence.dffits[0]
# Conventional empirical threshold: 2 * sqrt(k / n).
dffits_threshold = 2 * np.sqrt(number_of_parameters / number_of_observations)
# Rows flagged as influential outliers.
Teams[np.abs(dffits) > dffits_threshold]
ax = sns.regplot(np.arange(len(dffits)),dffits,ci=0,fit_reg=False)
# NOTE(review): the threshold lines only span x in [0, 30]; use
# len(dffits) if the guides should cover every observation.
plt.hlines(dffits_threshold,0,30)
plt.hlines(-dffits_threshold,0,30)
# Refit with the influential observations excluded.
cleaned_teams = Teams[np.abs(dffits) < dffits_threshold]
model4 = smf.ols('WinPCT ~ rundiffer', data=cleaned_teams).fit()
model4.summary()
sns.regplot(x='rundiffer', y='WinPCT', data=cleaned_teams)
# FIX: seaborn removed the sns.plt alias for pyplot; call matplotlib's
# pyplot (imported as plt) directly.
plt.title('Regression: WinPCT predicted by Run Differential for 1871-2016 (outliers excluded)')
# Total payroll per (year, team); salary data begins in 1985.
SalbyYrTeam = Salaries[Salaries['yearID'] >= 1985]['salary'].groupby([Salaries['yearID'],Salaries['teamID']]).sum()
# World Series winners since 1985.
WSW = Teams[(Teams['yearID']>= 1985) & (Teams['WSWin'] == 'Y')][['yearID','teamID','name']]
# For each winner, record its payroll that season and where that payroll
# ranked league-wide (1 = highest payroll).
# FIX: DataFrame.set_value was deprecated and removed from pandas;
# .loc both sets the cell and creates the new columns on first use.
for index, row in WSW.iterrows():
    AnnSals = SalbyYrTeam[row['yearID']]
    WSW.loc[index, 'payroll'] = AnnSals[row['teamID']]
    WSW.loc[index, 'payroll_rank'] = AnnSals.rank(ascending=False)[row['teamID']]
WSW
# How many World Series wins went to each payroll rank?
Rank_Wins = WSW['payroll_rank'].value_counts().reset_index()
Rank_Wins.columns = ['payroll_rank', 'winsby_rank']
Rank_Wins
# Bar chart of wins by payroll rank.
p = Bar(Rank_Wins, 'payroll_rank', values='winsby_rank', title="World Series Win by Pay Roll Rank",legend=False)
show(p)
# Contingency table of win counts against payroll rank.
ct = pd.crosstab(Rank_Wins['winsby_rank'],Rank_Wins['payroll_rank'])
ct.head()
# Chi-squared test of independence (scipy.stats is already imported
# as `stats` at the top of the file).
chi2,p,dof,ex = stats.chi2_contingency(ct)
print('chi2 =', chi2)
print('p-val =', p)
print('degrees of freedom = ', dof)
print('Expected: ')
pd.DataFrame(ex)